import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime
%matplotlib inline
# Load the raw robot telemetry CSV and take a first look at its contents.
df = pd.read_csv('data/robot_data.csv')
df.head()
df.describe()
df.select_dtypes(include=['object'])
# Parse the 'datetimes' strings into real datetime objects
# (format example: "010120-00:00:00.000000" = day month 2-digit-year).
df['datetimes'] = df['datetimes'].map(lambda s: datetime.datetime.strptime(s, "%d%m%y-%H:%M:%S.%f"))
# Build a cumulative elapsed-time column (seconds since the first sample)
# from consecutive differences of the parsed 'datetimes' column.
#
# BUG FIX: the original used (dt - prev_dt).microseconds, which is only the
# sub-second component of the timedelta (0..999999 us) and silently drops
# whole seconds from any gap >= 1 s.  total_seconds() is the correct value.
intervals = np.zeros(df.shape[0])
print(intervals.shape)
for i, dt in enumerate(df['datetimes']):
    if i == 0:
        prev_dt = dt
        continue
    intervals[i] = intervals[i - 1] + (dt - prev_dt).total_seconds()
    prev_dt = dt
print(intervals[:5])
# Wrap in a Series and attach it as the first column of the frame.
intervals = pd.Series(intervals)
df.insert(0, "time", intervals)
df.head()
robot_mode and safety_mode are actually categorical features, but they are stored as numeric codes.
For robot_mode:
For safety_mode:
For the purpose of our objective, which is to anticipate protective stops / faults based on the other features, we are only concerned with safety_mode values and not robot_mode, since we are implementing this model when the robot is always 'running' (7) or 'power on' (4).
For safety_mode, the values of concern to us are '1' (normal) and '3', '8' or '9' (protective stop, violation, fault). The other modes are triggered manually.
# robot_mode is constant for our use case and the raw timestamps have been
# replaced by 'time', so discard both columns in place.
df.drop(columns=['datetimes', 'robot_mode'], inplace=True)
df['safety_mode']
def change_to_category(x):
    """Binarize a safety_mode value: 0 for normal (mode 1), 1 otherwise.

    Raw modes are 1 = normal, 3 = protective stop, 8 = violation, 9 = fault;
    everything except 1 is collapsed into a single abnormal class.
    """
    return 0 if x == 1 else 1
# Map safety_mode onto the binary label and inspect the class balance.
df['safety_mode'] = df['safety_mode'].map(change_to_category)
df['safety_mode'].head()
sns.countplot(y='safety_mode', data=df)
plt.show()
First, we create columns for joint acceleration
# Pre-allocate one acceleration buffer per joint (base .. wrist3).
n_samples = df.shape[0]
base_acc = np.zeros(n_samples)
shoulder_acc = np.zeros(n_samples)
elbow_acc = np.zeros(n_samples)
wrist1_acc = np.zeros(n_samples)
wrist2_acc = np.zeros(n_samples)
wrist3_acc = np.zeros(n_samples)
def get_acceleration(array, joint, frame=None):
    """Fill `array` with finite-difference accelerations for one joint.

    Acceleration is approximated as (v[i] - v[i-1]) / (t[i] - t[i-1]) using
    the 'time' and f'actual_qd_{joint}' columns.  Element 0 is left at its
    initial value, since there is no preceding sample to difference against.

    Parameters
    ----------
    array : np.ndarray
        Output buffer, one slot per row; modified in place.
    joint : int
        Joint index 0-5, selecting the actual_qd_{joint} velocity column.
    frame : pd.DataFrame, optional
        Source frame.  Defaults to the module-level `df`, preserving the
        original global-based behavior while allowing explicit injection.

    Returns
    -------
    np.ndarray
        The same `array` object, for convenience.
    """
    if frame is None:
        frame = df
    for i, vel in enumerate(frame[f'actual_qd_{joint}']):
        if i == 0:
            prev_vel = vel
            continue
        d_time = float(frame['time'][i] - frame['time'][i - 1])
        # Consecutive duplicate timestamps would divide by zero; fall back
        # to the nominal 8 ms sampling period.
        if d_time == 0.0:
            d_time = 0.008
        d_vel = vel - prev_vel
        array[i] = d_vel / d_time
        prev_vel = vel
    return array
# Compute accelerations for all six joints and insert them into the frame
# as actual_qdd_0 .. actual_qdd_5, starting at column position 13.
acc_buffers = [base_acc, shoulder_acc, elbow_acc, wrist1_acc, wrist2_acc, wrist3_acc]
for joint_idx, buffer in enumerate(acc_buffers):
    acc_series = pd.Series(get_acceleration(buffer, joint_idx))
    df.insert(13 + joint_idx, f'actual_qdd_{joint_idx}', acc_series)
df.iloc[:, 13:19].head()
# Correlation heatmap over the numeric columns.
correlations = df.corr()
# change color scheme
sns.set_style('white')
# Mask the upper triangle so each feature pair is shown only once.
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin bool is the correct dtype here.
mask = np.zeros_like(correlations, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(12, 10))
# Scale to percentages so the annotations are compact integers.
sns.heatmap(correlations * 100,
            annot=True,
            fmt='.0f',
            mask=mask,
            cbar=True)
plt.show()
Create a function that cleans and engineers features in the raw robot CSV data.
def prepare_data(filepath):
    """Load the raw robot CSV and return a cleaned, feature-engineered frame.

    Steps:
      1. Parse 'datetimes' strings ("%d%m%y-%H:%M:%S.%f") into datetimes.
      2. Build a cumulative elapsed-time column 'time' (seconds).
      3. Binarize 'safety_mode' (1 -> 0 normal, anything else -> 1 abnormal).
      4. Add finite-difference joint accelerations actual_qdd_0..5.
      5. Drop 'datetimes', 'robot_mode' and the helper 'time' column, then
         de-duplicate rows.

    Parameters
    ----------
    filepath : str
        Path to the raw robot_data CSV.

    Returns
    -------
    pd.DataFrame
        Model-ready analytical base table.
    """
    df = pd.read_csv(filepath)
    # Parse timestamps.
    df['datetimes'] = df['datetimes'].map(
        lambda x: datetime.datetime.strptime(x, "%d%m%y-%H:%M:%S.%f"))
    # Cumulative elapsed time in seconds.
    # BUG FIX: the original used (dt - prev_dt).microseconds, which keeps
    # only the sub-second component and drops whole seconds from any gap
    # >= 1 s; total_seconds() is correct.
    intervals = np.zeros(df.shape[0])
    for i, dt in enumerate(df['datetimes']):
        if i == 0:
            prev_dt = dt
            continue
        intervals[i] = intervals[i - 1] + (dt - prev_dt).total_seconds()
        prev_dt = dt
    df.insert(0, "time", pd.Series(intervals))

    # Binary target: 0 = normal (mode 1), 1 = abnormal (3 / 8 / 9, ...).
    def change_to_category(x):
        return 0 if x == 1 else 1

    df['safety_mode'] = df['safety_mode'].apply(change_to_category)

    # Finite-difference joint accelerations.
    def get_acceleration(array, joint):
        for i, vel in enumerate(df[f'actual_qd_{joint}']):
            if i == 0:
                prev_vel = vel
                continue
            d_time = float(df['time'][i] - df['time'][i - 1])
            # Guard against duplicate timestamps: assume the nominal 8 ms
            # sampling period rather than dividing by zero.
            if d_time == 0.0:
                d_time = 0.008
            array[i] = (vel - prev_vel) / d_time
            prev_vel = vel
        return array

    for joint in range(6):
        acc = get_acceleration(np.zeros(df.shape[0]), joint)
        df.insert(13 + joint, f'actual_qdd_{joint}', pd.Series(acc))

    # Drop helper / unwanted columns and duplicated rows.
    df.drop(['datetimes', 'robot_mode', 'time'], axis=1, inplace=True)
    df = df.drop_duplicates()
    return df
# Sanity-check the preparation pipeline end-to-end on the raw file.
filepath = 'data/robot_data.csv'
df_prepared = prepare_data(filepath)
df_prepared.head()
Finally, save the new DataFrame that we've cleaned and then augmented through feature engineering.
This is the dataframe that we will be building our models on.
# Persist the cleaned, feature-engineered table for the modelling section.
# NOTE(review): index=None (rather than index=False) is passed here; it is
# falsy, so the row index is presumably omitted — confirm against pandas docs.
df_prepared.to_csv('analytical_base_table.csv', index=None)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
# Reload the persisted analytical base table for the modelling section.
df = pd.read_csv('analytical_base_table.csv')
df.head()
PCA attempts to reduce the number of features within a dataset while retaining the 'principal components' (weighted, linear combinations of existing features that are designed to be linearly independent and account for the largest possible variance in the data).
It can be thought of as taking many features and combining similar or redundant features together to form a new, smaller feature set.
# sklearn works on numeric labels only; factorize shows the code/unique
# mapping before we commit to it.
codes, uniques = pd.factorize(df['safety_mode'])
print(codes, uniques)
df['safety_mode'] = pd.Categorical(codes)
# Fit PCA on the full frame and obtain the transformed components.
pca = PCA(random_state=42)
robot_pca = pca.fit_transform(df)
3 types of model attributes are contained within the PCA model.
We are only interested in components_ and explained_variance_ratio_.
# One explained-variance ratio per fitted component.
num_components = len(pca.explained_variance_ratio_)
num_components
# Per-component variance fractions and their running cumulative total.
vals = pca.explained_variance_ratio_
cumvals = np.cumsum(vals)
In practice, we aim for a number of principal components that can capture 80-90% variance.
# Number of components chosen by trial and error.
top_n_components = 3
# BUG FIX: cumvals is 0-indexed, so the variance captured by the top n
# components is cumvals[n - 1]; cumvals[n] counted one component too many.
captured_variance = cumvals[top_n_components - 1]
print(f"The top {top_n_components} principal components captured {captured_variance} of the total variance.")
From trial and error, we can see that only 3 principal components of the PCA-transformed dataset account for 95% of our total variance.
Hence it might not be necessary to perform PCA as our original data exhibits enough variance.
We will cluster the data using the KMeans algorithm to see if we can draw additional insights from the data.
# Sweep the cluster count k = 2..20 and record |score| (magnitude of the
# pipeline's KMeans objective on the standardized data) for each k.
kmeans_scores = []
for k in range(2, 21):
    clusterer = make_pipeline(StandardScaler(), MiniBatchKMeans(k, random_state=42))
    clusterer.fit(df)
    score = np.abs(clusterer.score(df))
    print(f'For {k} clusters, average point-to-centroid distances = {score}')
    kmeans_scores.append(score)
# Visualize how the intra-cluster distance changes with the cluster count.
# BUG FIX: the sweep above produced 19 scores for k = 2..20, but the labels
# were range(2, 22, 2) (only 10 values), so zip silently truncated the data
# and mislabelled every score.  Use the same k values as the sweep.
n = list(range(2, 21))
cluster_df = pd.DataFrame(list(zip(kmeans_scores, n)))
# name the columns
cluster_df.columns = ['kmeans_scores', 'n']
cluster_df.plot.bar(x='n', y='kmeans_scores')
Choose 14 clusters to segment the population: after 14 clusters, the rate of decrease in the average distance is smaller, so the additional benefit of using more clusters diminishes. (NOTE: the code below actually fits 10 clusters — reconcile this choice with the text.)
%%time
# refit the kmeans model with the selected number of clusters
kmeans = make_pipeline(StandardScaler(), MiniBatchKMeans(10, random_state=42))
# predict the clusters
clusters_robot = kmeans.fit_predict(df)
clusters_robot_df = pd.DataFrame(clusters_robot, columns=['Cluster'])
clusters_robot_df.head()
num_clusters = 10
# Percentage of rows falling in each cluster.
robot_df_prop = [
    len(clusters_robot_df[clusters_robot_df['Cluster'] == i]) / len(clusters_robot_df) * 100
    for i in range(num_clusters)
]
prop_df = pd.DataFrame(
    list(zip(range(num_clusters), robot_df_prop)),
    columns=['cluster', 'proportion%'])
prop_df
# Separate the target from the features.
y = df['safety_mode']
X = df.drop(['safety_mode'], axis=1)
# Stratified 80/20 split keeps the class ratio equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
# Candidate model pipelines: standardization followed by the classifier.
pipelines = {
    'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=42)),
    'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42)),
}
# Hyperparameter grids, keyed by the pipeline step name.
# BUG FIX: max_features='auto' was deprecated in scikit-learn 1.0 and
# removed in 1.3 (for classifiers it was an alias of 'sqrt'); use only
# currently valid options.
rf_hyperparameters = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_features': ['sqrt', 'log2', 0.33]
}
gb_hyperparameters = {
    'gradientboostingclassifier__n_estimators': [100, 200],
    'gradientboostingclassifier__learning_rate': [0.05, 0.1, 0.2],
    'gradientboostingclassifier__max_depth': [1, 3, 5]
}
# Lookup table so the fitting loop can fetch a grid by model name.
hyperparameters = {
    'rf': rf_hyperparameters,
    'gb': gb_hyperparameters
}
%%time
fitted_models = {}
# 10-fold cross validation
for name, pipeline in pipelines.items():
model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1)
# fit model on X_train, y_train
model.fit(X_train, y_train)
# store inside model dictionary
fitted_models[name] = model
print(f'{name} has been fitted!')
for name, model in fitted_models.items():
print(name, model.best_score_)
for name, model in fitted_models.items():
print(name, model.best_params_)
For classification tasks, we use AUROC as an evaluation metric, which handles imbalanced classes too.
# Evaluate the chosen gradient-boosting model on the held-out test set
# using the ROC curve and the area under it.
pred = [p[1] for p in fitted_models['gb'].predict_proba(X_test)]
fpr, tpr, thresholds = roc_curve(y_test, pred)
fig = plt.figure(figsize=(8, 8))
plt.title('ROC Curve')
plt.plot(fpr, tpr, label='gb')
plt.legend(loc='upper right')
# Chance-level 45-degree diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Summarize the curve with a single number.
area_under_curve = auc(fpr, tpr)
print(f"Area under ROC curve: {area_under_curve}")
from sklearn.metrics import confusion_matrix

# Hard class predictions; rows = actual class, columns = predicted class.
pred = fitted_models['gb'].predict(X_test)
print(confusion_matrix(y_test, pred))
There are 14659 true positives, and 176 true negatives.
There are 8 false positives and 17 false negatives.
This means that of all the observations that were actually positive, 14659 were predicted to be positive. Of all the observations that were actually negative, 176 were predicted to be negative.
import pickle

# Persist only the best estimator, not the whole grid-search object.
with open('final_model.pkl', 'wb') as f:
    pickle.dump(fitted_models['gb'].best_estimator_, f)
import plotly.graph_objs as go
def prepare_data_for_plots(filepath):
    """Load the raw robot CSV and prepare it for plotting.

    Same cleaning/feature engineering as prepare_data, except the 'time'
    column is KEPT (the plots use it as the x-axis):
      1. Parse 'datetimes' strings ("%d%m%y-%H:%M:%S.%f") into datetimes.
      2. Build a cumulative elapsed-time column 'time' (seconds).
      3. Binarize 'safety_mode' (1 -> 0 normal, anything else -> 1 abnormal).
      4. Add finite-difference joint accelerations actual_qdd_0..5.
      5. Drop 'datetimes' and 'robot_mode', then de-duplicate rows.

    Parameters
    ----------
    filepath : str
        Path to the raw robot_data CSV.

    Returns
    -------
    pd.DataFrame
        Cleaned frame including the 'time' column.
    """
    df = pd.read_csv(filepath)
    # Parse timestamps.
    df['datetimes'] = df['datetimes'].map(
        lambda x: datetime.datetime.strptime(x, "%d%m%y-%H:%M:%S.%f"))
    # Cumulative elapsed time in seconds.
    # BUG FIX: the original used (dt - prev_dt).microseconds, which keeps
    # only the sub-second component and drops whole seconds from any gap
    # >= 1 s; total_seconds() is correct.
    intervals = np.zeros(df.shape[0])
    for i, dt in enumerate(df['datetimes']):
        if i == 0:
            prev_dt = dt
            continue
        intervals[i] = intervals[i - 1] + (dt - prev_dt).total_seconds()
        prev_dt = dt
    df.insert(0, "time", pd.Series(intervals))

    # Binary target: 0 = normal (mode 1), 1 = abnormal (3 / 8 / 9, ...).
    def change_to_category(x):
        return 0 if x == 1 else 1

    df['safety_mode'] = df['safety_mode'].apply(change_to_category)

    # Finite-difference joint accelerations.
    def get_acceleration(array, joint):
        for i, vel in enumerate(df[f'actual_qd_{joint}']):
            if i == 0:
                prev_vel = vel
                continue
            d_time = float(df['time'][i] - df['time'][i - 1])
            # Guard against duplicate timestamps: assume the nominal 8 ms
            # sampling period rather than dividing by zero.
            if d_time == 0.0:
                d_time = 0.008
            array[i] = (vel - prev_vel) / d_time
            prev_vel = vel
        return array

    for joint in range(6):
        acc = get_acceleration(np.zeros(df.shape[0]), joint)
        df.insert(13 + joint, f'actual_qdd_{joint}', pd.Series(acc))

    # Drop unwanted columns (keeping 'time') and duplicated rows.
    df.drop(['datetimes', 'robot_mode'], axis=1, inplace=True)
    df = df.drop_duplicates()
    return df
df = prepare_data_for_plots('../web_app/data/robot_data.csv')
time = df['time'].tolist()
# First chart: one current-vs-time line per joint.
joint_names = ['base', 'shoulder', 'elbow', 'wrist_1', 'wrist_2', 'wrist_3']
graph_one = []
graph_one.append([
    go.Scatter(
        x=time,
        y=df[f'actual_current_{i}'].tolist(),
        mode='lines',
        name=joint_name)
    for i, joint_name in enumerate(joint_names)
])
layout_one = dict(title='Graph of Joint Currents vs Time',
                  xaxis=dict(title='Time'),
                  yaxis=dict(title='Joint Currents (A)'),
                  )
# Sixth chart: scalar TCP force over time.
graph_six = [
    go.Scatter(
        x=time,
        y=df['tcp_force_scalar'].tolist(),
        mode='lines',
        name='force')
]
layout_six = dict(title='Graph of TCP Force vs Time',
                  xaxis=dict(title='Time'),
                  yaxis=dict(title='TCP Force(N)'),
                  )
# Seventh chart: percentage occurrence of each safety-mode class.
classes = df['safety_mode'].unique()
safety_mode_list = [len(df[df['safety_mode'] == label]) / len(df) * 100
                    for label in classes]
graph_seven = [
    go.Bar(
        x=classes,
        y=safety_mode_list,
    )
]
layout_seven = dict(title='Safety Mode Types Occurrence (%)',
                    xaxis=dict(title='Safety Mode Type',),
                    yaxis=dict(title='Occurrence (%)'),
                    )
# Render the TCP-force chart.
fig = go.Figure()
for trace in graph_six:
    fig.add_trace(trace)
fig.update_layout(layout_six)
fig.show()
# Render the safety-mode occurrence chart.
fig = go.Figure()
for trace in graph_seven:
    fig.add_trace(trace)
fig.update_layout(layout_seven)
fig.show()
Attempt to predict batched data.
def prepare_batched_data(filepath):
    """Prepare a small batch of raw robot rows as model inputs.

    Mirrors prepare_data, but keeps only the first 3 rows (simulating a
    small incoming batch) and strips the 'safety_mode' target so the
    result can be fed straight to the classifier:
      1. Parse 'datetimes' strings ("%d%m%y-%H:%M:%S.%f") into datetimes.
      2. Build a cumulative elapsed-time column 'time' (seconds).
      3. Binarize 'safety_mode' (dropped again at the end).
      4. Add finite-difference joint accelerations actual_qdd_0..5.
      5. Drop 'datetimes', 'robot_mode', 'time' and 'safety_mode';
         de-duplicate rows.

    Parameters
    ----------
    filepath : str
        Path to the raw robot_data CSV.

    Returns
    -------
    pd.DataFrame
        Feature-only frame ready for predict / predict_proba.
    """
    df = pd.read_csv(filepath)
    # Keep only the first 3 rows to simulate a small incoming batch.
    df = df.iloc[:3, :]
    # Parse timestamps.
    df['datetimes'] = df['datetimes'].map(
        lambda x: datetime.datetime.strptime(x, "%d%m%y-%H:%M:%S.%f"))
    # Cumulative elapsed time in seconds.
    # BUG FIX: the original used (dt - prev_dt).microseconds, which keeps
    # only the sub-second component and drops whole seconds from any gap
    # >= 1 s; total_seconds() is correct.
    intervals = np.zeros(df.shape[0])
    for i, dt in enumerate(df['datetimes']):
        if i == 0:
            prev_dt = dt
            continue
        intervals[i] = intervals[i - 1] + (dt - prev_dt).total_seconds()
        prev_dt = dt
    df.insert(0, "time", pd.Series(intervals))

    # Binary target (removed again below; kept here so the cleaning
    # steps stay identical to prepare_data).
    def change_to_category(x):
        return 0 if x == 1 else 1

    df['safety_mode'] = df['safety_mode'].apply(change_to_category)

    # Finite-difference joint accelerations.
    def get_acceleration(array, joint):
        for i, vel in enumerate(df[f'actual_qd_{joint}']):
            if i == 0:
                prev_vel = vel
                continue
            d_time = float(df['time'][i] - df['time'][i - 1])
            # Guard against duplicate timestamps: assume the nominal 8 ms
            # sampling period rather than dividing by zero.
            if d_time == 0.0:
                d_time = 0.008
            array[i] = (vel - prev_vel) / d_time
            prev_vel = vel
        return array

    for joint in range(6):
        acc = get_acceleration(np.zeros(df.shape[0]), joint)
        df.insert(13 + joint, f'actual_qdd_{joint}', pd.Series(acc))

    # Drop helper / unwanted columns and duplicated rows.
    df.drop(['datetimes', 'robot_mode', 'time'], axis=1, inplace=True)
    df = df.drop_duplicates()
    # Keep inputs only: the model predicts safety_mode, so remove it.
    df = df.drop(['safety_mode'], axis=1)
    return df
# Run the saved gradient-boosting pipeline on the prepared batch.
df = prepare_batched_data('data/robot_data.csv')
df.head()
pred = fitted_models['gb'].predict_proba(df)
pred
# Column 1: probability of class '1' (robot error) being predicted.
pred_class_1 = [p[1] for p in pred]
pred_class_1
# Column 0: probability of class '0' (normal) being predicted.
pred_class_0 = [p[0] for p in pred]
pred_class_0